In [273]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
plt.style.use ("dark_background")
In [274]:
dataset = pd.read_excel('Z-Alizadeh sani dataset.xlsx')
dataset
Out[274]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
0 53 90 175 Male 29.387755 0 1 1 0 0 ... 4.7 141 5700 39 52 261 50 0 N Cad
1 67 70 157 Fmale 28.398718 0 1 0 0 0 ... 4.7 156 7700 38 55 165 40 4 N Cad
2 54 54 164 Male 20.077335 0 0 1 0 0 ... 4.7 139 7400 38 60 230 40 2 mild Cad
3 66 67 158 Fmale 26.838648 0 1 0 0 0 ... 4.4 142 13000 18 72 742 55 0 Severe Normal
4 50 87 153 Fmale 37.165193 0 1 0 0 0 ... 4.0 140 9200 55 39 274 50 0 Severe Normal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 58 84 168 Male 29.761905 0 0 0 0 0 ... 4.8 146 8500 34 58 251 45 0 N Cad
299 55 64 152 Fmale 27.700831 0 0 0 0 0 ... 4.0 139 11400 16 80 377 40 0 mild Normal
300 48 77 160 Fmale 30.078125 0 1 0 0 1 ... 4.0 140 9000 35 55 279 55 0 N Normal
301 57 90 159 Fmale 35.599858 1 0 0 0 0 ... 3.8 141 3800 48 40 208 55 0 N Normal
302 56 85 170 Fmale 29.411765 0 1 1 0 0 ... 4.4 147 6000 32 55 302 55 0 N Cad

303 rows × 56 columns

In [275]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    303 non-null    int64  
 1   Weight                 303 non-null    int64  
 2   Length                 303 non-null    int64  
 3   Sex                    303 non-null    object 
 4   BMI                    303 non-null    float64
 5   DM                     303 non-null    int64  
 6   HTN                    303 non-null    int64  
 7   Current Smoker         303 non-null    int64  
 8   EX-Smoker              303 non-null    int64  
 9   FH                     303 non-null    int64  
 10  Obesity                303 non-null    object 
 11  CRF                    303 non-null    object 
 12  CVA                    303 non-null    object 
 13  Airway disease         303 non-null    object 
 14  Thyroid Disease        303 non-null    object 
 15  CHF                    303 non-null    object 
 16  DLP                    303 non-null    object 
 17  BP                     303 non-null    int64  
 18  PR                     303 non-null    int64  
 19  Edema                  303 non-null    int64  
 20  Weak Peripheral Pulse  303 non-null    object 
 21  Lung rales             303 non-null    object 
 22  Systolic Murmur        303 non-null    object 
 23  Diastolic Murmur       303 non-null    object 
 24  Typical Chest Pain     303 non-null    int64  
 25  Dyspnea                303 non-null    object 
 26  Function Class         303 non-null    int64  
 27  Atypical               303 non-null    object 
 28  Nonanginal             303 non-null    object 
 29  Exertional CP          303 non-null    object 
 30  LowTH Ang              303 non-null    object 
 31  Q Wave                 303 non-null    int64  
 32  St Elevation           303 non-null    int64  
 33  St Depression          303 non-null    int64  
 34  Tinversion             303 non-null    int64  
 35  LVH                    303 non-null    object 
 36  Poor R Progression     303 non-null    object 
 37  BBB                    303 non-null    object 
 38  FBS                    303 non-null    int64  
 39  CR                     303 non-null    float64
 40  TG                     303 non-null    int64  
 41  LDL                    303 non-null    int64  
 42  HDL                    303 non-null    float64
 43  BUN                    303 non-null    int64  
 44  ESR                    303 non-null    int64  
 45  HB                     303 non-null    float64
 46  K                      303 non-null    float64
 47  Na                     303 non-null    int64  
 48  WBC                    303 non-null    int64  
 49  Lymph                  303 non-null    int64  
 50  Neut                   303 non-null    int64  
 51  PLT                    303 non-null    int64  
 52  EF-TTE                 303 non-null    int64  
 53  Region RWMA            303 non-null    int64  
 54  VHD                    303 non-null    object 
 55  Cath                   303 non-null    object 
dtypes: float64(5), int64(29), object(22)
memory usage: 132.7+ KB
In [276]:
dataset.isna().sum()
Out[276]:
Age                      0
Weight                   0
Length                   0
Sex                      0
BMI                      0
DM                       0
HTN                      0
Current Smoker           0
EX-Smoker                0
FH                       0
Obesity                  0
CRF                      0
CVA                      0
Airway disease           0
Thyroid Disease          0
CHF                      0
DLP                      0
BP                       0
PR                       0
Edema                    0
Weak Peripheral Pulse    0
Lung rales               0
Systolic Murmur          0
Diastolic Murmur         0
Typical Chest Pain       0
Dyspnea                  0
Function Class           0
Atypical                 0
Nonanginal               0
Exertional CP            0
LowTH Ang                0
Q Wave                   0
St Elevation             0
St Depression            0
Tinversion               0
LVH                      0
Poor R Progression       0
BBB                      0
FBS                      0
CR                       0
TG                       0
LDL                      0
HDL                      0
BUN                      0
ESR                      0
HB                       0
K                        0
Na                       0
WBC                      0
Lymph                    0
Neut                     0
PLT                      0
EF-TTE                   0
Region RWMA              0
VHD                      0
Cath                     0
dtype: int64
In [277]:
sns.heatmap(dataset.isna())
Out[277]:
<Axes: >
In [278]:
dataset.describe()
Out[278]:
Age Weight Length BMI DM HTN Current Smoker EX-Smoker FH BP ... ESR HB K Na WBC Lymph Neut PLT EF-TTE Region RWMA
count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 ... 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000
mean 58.897690 73.831683 164.716172 27.248339 0.297030 0.590759 0.207921 0.033003 0.158416 129.554455 ... 19.462046 13.153465 4.230693 140.996700 7562.046205 32.399340 60.148515 221.488449 47.231023 0.620462
std 10.392278 11.987358 9.327661 4.098865 0.457706 0.492507 0.406491 0.178941 0.365734 18.938105 ... 15.936475 1.610452 0.458202 3.807885 2413.739323 9.972592 10.182493 60.796199 8.927194 1.132531
min 30.000000 48.000000 140.000000 18.115413 0.000000 0.000000 0.000000 0.000000 0.000000 90.000000 ... 1.000000 8.900000 3.000000 128.000000 3700.000000 7.000000 32.000000 25.000000 15.000000 0.000000
25% 51.000000 65.000000 158.000000 24.514380 0.000000 0.000000 0.000000 0.000000 0.000000 120.000000 ... 9.000000 12.200000 3.900000 139.000000 5800.000000 26.000000 52.500000 183.500000 45.000000 0.000000
50% 58.000000 74.000000 165.000000 26.775510 0.000000 1.000000 0.000000 0.000000 0.000000 130.000000 ... 15.000000 13.200000 4.200000 141.000000 7100.000000 32.000000 60.000000 210.000000 50.000000 0.000000
75% 66.000000 81.000000 171.000000 29.411765 1.000000 1.000000 0.000000 0.000000 0.000000 140.000000 ... 26.000000 14.200000 4.500000 143.000000 8800.000000 39.000000 67.000000 250.000000 55.000000 1.000000
max 86.000000 120.000000 188.000000 40.900658 1.000000 1.000000 1.000000 1.000000 1.000000 190.000000 ... 90.000000 17.600000 6.600000 156.000000 18000.000000 60.000000 89.000000 742.000000 60.000000 4.000000

8 rows × 34 columns

In [279]:
sns.heatmap(dataset.describe())
Out[279]:
<Axes: >
In [280]:
dataset.corr()
/tmp/ipykernel_9901/2191645083.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  dataset.corr()
Out[280]:
Age Weight Length BMI DM HTN Current Smoker EX-Smoker FH BP ... ESR HB K Na WBC Lymph Neut PLT EF-TTE Region RWMA
Age 1.000000 -0.264585 -0.163753 -0.161414 0.072543 0.246690 -0.143879 0.076608 -0.183900 0.215527 ... 0.183127 -0.161018 0.154203 -0.071886 0.020398 -0.171529 0.173030 -0.049500 -0.140512 0.108663
Weight -0.264585 1.000000 0.460631 0.725005 -0.003531 -0.028532 0.157385 0.068977 0.021963 -0.025930 ... -0.139314 0.274218 -0.018287 0.013916 -0.020214 0.020120 -0.051093 -0.094192 0.026789 -0.007648
Length -0.163753 0.460631 1.000000 -0.269356 -0.052318 -0.153668 0.335248 0.079034 0.004488 -0.072511 ... -0.222182 0.341028 -0.086277 -0.066590 0.066658 -0.156436 0.115075 -0.133446 -0.093295 0.095715
BMI -0.161414 0.725005 -0.269356 1.000000 0.045360 0.091652 -0.089398 0.005016 0.014045 0.031916 ... 0.023259 0.031107 0.044587 0.067402 -0.074928 0.139583 -0.140037 -0.003964 0.093903 -0.079739
DM 0.072543 -0.003531 -0.052318 0.045360 1.000000 0.217864 -0.208458 -0.120087 -0.064434 0.128010 ... 0.190397 -0.156382 0.100064 -0.083030 0.110345 0.033413 -0.024417 0.051054 -0.052507 0.064891
HTN 0.246690 -0.028532 -0.153668 0.091652 0.217864 1.000000 -0.169000 0.041045 -0.098467 0.570418 ... 0.161704 -0.115935 0.011826 0.036355 -0.069374 0.017204 -0.025476 -0.043840 0.031365 -0.000372
Current Smoker -0.143879 0.157385 0.335248 -0.089398 -0.208458 -0.169000 1.000000 -0.094652 0.089532 -0.079115 ... -0.121199 0.216144 -0.016599 0.036812 0.046205 -0.053224 0.042115 -0.048473 -0.068943 0.078479
EX-Smoker 0.076608 0.068977 0.079034 0.005016 -0.120087 0.041045 -0.094652 1.000000 -0.080152 0.028781 ... 0.017858 -0.024528 -0.004318 -0.038716 -0.057655 0.070524 -0.068122 -0.079102 -0.015153 0.012996
FH -0.183900 0.021963 0.004488 0.014045 -0.064434 -0.098467 0.089532 -0.080152 1.000000 -0.082999 ... -0.061457 -0.045348 -0.017255 -0.113749 0.067973 -0.014679 0.040786 -0.023000 0.089157 -0.038230
BP 0.215527 -0.025930 -0.072511 0.031916 0.128010 0.570418 -0.079115 0.028781 -0.082999 1.000000 ... 0.036155 -0.129196 0.033902 0.067064 -0.071686 -0.005244 -0.007812 -0.092516 -0.047472 0.024047
PR 0.023576 -0.075468 -0.077549 -0.015680 0.025350 0.124176 0.002796 -0.065240 -0.057717 0.183231 ... 0.108768 -0.070392 0.147650 0.010357 0.080313 -0.141028 0.144888 -0.066714 -0.210017 0.152990
Edema 0.132487 -0.035323 -0.039241 -0.009812 0.016133 0.134600 -0.062343 0.057211 0.004589 0.085339 ... -0.034615 -0.094113 0.015969 -0.004275 0.101513 -0.025142 0.040315 -0.054050 -0.079315 0.083133
Typical Chest Pain 0.138387 -0.002986 0.023149 -0.012911 0.105623 0.122788 0.079987 0.058855 -0.035920 0.114926 ... 0.073777 0.057803 0.126926 -0.026933 -0.022196 -0.076830 0.065572 -0.082399 -0.103957 0.177166
Function Class 0.051424 0.040371 -0.012710 0.064736 0.086200 0.092880 -0.037824 0.024499 0.010162 0.017544 ... 0.096865 0.028994 0.007217 -0.055051 0.075008 -0.004590 -0.012557 0.050548 -0.119119 0.131131
Q Wave -0.061677 0.020584 0.045229 -0.015837 0.072583 -0.043583 -0.084608 0.121591 -0.062024 -0.033462 ... 0.136909 0.007752 0.048677 -0.155067 0.013517 -0.055419 0.029939 0.003692 -0.266077 0.222826
St Elevation -0.056926 0.071406 0.050602 0.029896 0.028955 -0.040627 0.042192 0.047341 -0.009379 -0.061334 ... 0.141827 0.065039 0.112392 -0.103195 0.128728 -0.060937 0.035447 0.070754 -0.231493 0.268545
St Depression 0.177432 -0.114619 -0.150480 -0.009146 0.015532 0.016736 0.023762 -0.014970 0.080075 -0.032295 ... 0.078451 -0.124527 -0.033712 -0.005668 0.107977 -0.019841 0.066265 0.119812 -0.015214 0.165025
Tinversion 0.041913 -0.021033 0.023690 -0.043311 -0.058998 0.115040 0.200881 0.082060 0.054250 0.046643 ... 0.081901 0.067330 0.032172 -0.024134 0.018630 -0.015916 -0.003813 0.011547 -0.121389 0.173485
FBS 0.015385 0.012737 -0.094789 0.089380 0.677940 0.109592 -0.101457 -0.079537 -0.080815 0.145861 ... 0.143823 -0.164087 0.102861 -0.059455 0.159957 -0.003719 0.031787 0.019886 -0.056692 0.037291
CR 0.227097 0.150226 0.162634 0.034338 0.028606 0.158881 -0.046339 0.178112 0.031882 0.077407 ... 0.023793 -0.019728 -0.010450 -0.074997 0.145125 -0.066620 0.096707 -0.091782 -0.115351 0.031475
TG -0.110793 0.078469 -0.034389 0.109422 0.108792 0.045954 0.062399 0.015597 -0.019083 0.019586 ... -0.044736 0.123914 0.023490 0.060313 0.012340 0.090065 -0.081575 -0.049424 -0.027902 0.035353
LDL -0.033576 -0.023233 -0.090970 0.040001 -0.027167 0.022755 -0.025440 -0.025844 0.111292 0.080683 ... -0.013132 0.063645 0.037732 0.168126 0.019056 0.118307 -0.085044 0.013452 0.159394 -0.026927
HDL -0.035793 -0.059713 -0.050594 -0.024338 -0.043890 -0.094226 0.010228 -0.056676 0.078685 -0.012459 ... -0.084301 -0.048461 -0.074145 0.088912 -0.063782 0.028257 -0.024528 0.000638 0.104394 -0.062022
BUN 0.300663 -0.057670 -0.071229 -0.011139 0.144394 0.152895 -0.061596 0.066455 -0.014419 0.038045 ... 0.126928 -0.085335 0.098618 -0.136310 0.088416 -0.044946 0.024421 0.041289 -0.116665 0.018362
ESR 0.183127 -0.139314 -0.222182 0.023259 0.190397 0.161704 -0.121199 0.017858 -0.061457 0.036155 ... 1.000000 -0.389803 0.006577 -0.069327 0.160759 -0.158031 0.138741 0.246826 -0.057497 0.054697
HB -0.161018 0.274218 0.341028 0.031107 -0.156382 -0.115935 0.216144 -0.024528 -0.045348 -0.129196 ... -0.389803 1.000000 0.033308 0.138745 -0.000567 0.083837 -0.075441 -0.106252 0.006186 -0.045481
K 0.154203 -0.018287 -0.086277 0.044587 0.100064 0.011826 -0.016599 -0.004318 -0.017255 0.033902 ... 0.006577 0.033308 1.000000 0.010686 0.118689 -0.008561 -0.002896 0.022865 -0.159512 0.229266
Na -0.071886 0.013916 -0.066590 0.067402 -0.083030 0.036355 0.036812 -0.038716 -0.113749 0.067064 ... -0.069327 0.138745 0.010686 1.000000 -0.093826 0.141032 -0.134406 -0.022049 0.136491 -0.022558
WBC 0.020398 -0.020214 0.066658 -0.074928 0.110345 -0.069374 0.046205 -0.057655 0.067973 -0.071686 ... 0.160759 -0.000567 0.118689 -0.093826 1.000000 -0.322100 0.377770 0.290805 -0.137910 0.175318
Lymph -0.171529 0.020120 -0.156436 0.139583 0.033413 0.017204 -0.053224 0.070524 -0.014679 -0.005244 ... -0.158031 0.083837 -0.008561 0.141032 -0.322100 1.000000 -0.923081 -0.011639 0.239827 -0.079181
Neut 0.173030 -0.051093 0.115075 -0.140037 -0.024417 -0.025476 0.042115 -0.068122 0.040786 -0.007812 ... 0.138741 -0.075441 -0.002896 -0.134406 0.377770 -0.923081 1.000000 0.003637 -0.228776 0.112580
PLT -0.049500 -0.094192 -0.133446 -0.003964 0.051054 -0.043840 -0.048473 -0.079102 -0.023000 -0.092516 ... 0.246826 -0.106252 0.022865 -0.022049 0.290805 -0.011639 0.003637 1.000000 0.068409 -0.010812
EF-TTE -0.140512 0.026789 -0.093295 0.093903 -0.052507 0.031365 -0.068943 -0.015153 0.089157 -0.047472 ... -0.057497 0.006186 -0.159512 0.136491 -0.137910 0.239827 -0.228776 0.068409 1.000000 -0.450799
Region RWMA 0.108663 -0.007648 0.095715 -0.079739 0.064891 -0.000372 0.078479 0.012996 -0.038230 0.024047 ... 0.054697 -0.045481 0.229266 -0.022558 0.175318 -0.079181 0.112580 -0.010812 -0.450799 1.000000

34 rows × 34 columns

In [281]:
plt.figure(figsize=(40, 20))
sns.heatmap(dataset.corr())
top_corr_features = dataset.corr().index
g=sns.heatmap(dataset[top_corr_features].corr(),annot=True,linewidth=.10,cmap="rocket")
/tmp/ipykernel_9901/3249925955.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(dataset.corr())
/tmp/ipykernel_9901/3249925955.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  top_corr_features = dataset.corr().index
In [282]:
plt.figure(figsize=(40, 20))
matrix = np.triu(dataset.corr())
sns.heatmap(dataset.corr(), annot=True, linewidth=.10, mask=matrix, cmap="Paired");
/tmp/ipykernel_9901/2397469981.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  matrix = np.triu(dataset.corr())
/tmp/ipykernel_9901/2397469981.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(dataset.corr(), annot=True, linewidth=.10, mask=matrix, cmap="Paired");
In [283]:
plt.figure(figsize=(40, 20))
sns.histplot(data=dataset.corr(),kde=True,palette='hot')
/tmp/ipykernel_9901/3055010306.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.histplot(data=dataset.corr(),kde=True,palette='hot')
Out[283]:
<Axes: ylabel='Count'>
In [284]:
dataset.nunique()
Out[284]:
Age                       46
Weight                    54
Length                    44
Sex                        2
BMI                      263
DM                         2
HTN                        2
Current Smoker             2
EX-Smoker                  2
FH                         2
Obesity                    2
CRF                        2
CVA                        2
Airway disease             2
Thyroid Disease            2
CHF                        2
DLP                        2
BP                        17
PR                        21
Edema                      2
Weak Peripheral Pulse      2
Lung rales                 2
Systolic Murmur            2
Diastolic Murmur           2
Typical Chest Pain         2
Dyspnea                    2
Function Class             4
Atypical                   2
Nonanginal                 2
Exertional CP              1
LowTH Ang                  2
Q Wave                     2
St Elevation               2
St Depression              2
Tinversion                 2
LVH                        2
Poor R Progression         2
BBB                        3
FBS                      113
CR                        18
TG                       147
LDL                      110
HDL                       47
BUN                       33
ESR                       58
HB                        66
K                         27
Na                        25
WBC                       78
Lymph                     50
Neut                      52
PLT                      135
EF-TTE                    11
Region RWMA                5
VHD                        4
Cath                       2
dtype: int64
In [285]:
unique = pd.DataFrame(dataset.nunique())
In [286]:
plt.figure(figsize=(15, 10))
sns.heatmap(unique,annot=True, linewidth=.10, cmap="Paired")
Out[286]:
<Axes: >

drop irrelevant data¶

In [287]:
dataset.drop(['Exertional CP'], axis=1, inplace=True)
dataset
Out[287]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
0 53 90 175 Male 29.387755 0 1 1 0 0 ... 4.7 141 5700 39 52 261 50 0 N Cad
1 67 70 157 Fmale 28.398718 0 1 0 0 0 ... 4.7 156 7700 38 55 165 40 4 N Cad
2 54 54 164 Male 20.077335 0 0 1 0 0 ... 4.7 139 7400 38 60 230 40 2 mild Cad
3 66 67 158 Fmale 26.838648 0 1 0 0 0 ... 4.4 142 13000 18 72 742 55 0 Severe Normal
4 50 87 153 Fmale 37.165193 0 1 0 0 0 ... 4.0 140 9200 55 39 274 50 0 Severe Normal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 58 84 168 Male 29.761905 0 0 0 0 0 ... 4.8 146 8500 34 58 251 45 0 N Cad
299 55 64 152 Fmale 27.700831 0 0 0 0 0 ... 4.0 139 11400 16 80 377 40 0 mild Normal
300 48 77 160 Fmale 30.078125 0 1 0 0 1 ... 4.0 140 9000 35 55 279 55 0 N Normal
301 57 90 159 Fmale 35.599858 1 0 0 0 0 ... 3.8 141 3800 48 40 208 55 0 N Normal
302 56 85 170 Fmale 29.411765 0 1 1 0 0 ... 4.4 147 6000 32 55 302 55 0 N Cad

303 rows × 55 columns

In [288]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 55 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    303 non-null    int64  
 1   Weight                 303 non-null    int64  
 2   Length                 303 non-null    int64  
 3   Sex                    303 non-null    object 
 4   BMI                    303 non-null    float64
 5   DM                     303 non-null    int64  
 6   HTN                    303 non-null    int64  
 7   Current Smoker         303 non-null    int64  
 8   EX-Smoker              303 non-null    int64  
 9   FH                     303 non-null    int64  
 10  Obesity                303 non-null    object 
 11  CRF                    303 non-null    object 
 12  CVA                    303 non-null    object 
 13  Airway disease         303 non-null    object 
 14  Thyroid Disease        303 non-null    object 
 15  CHF                    303 non-null    object 
 16  DLP                    303 non-null    object 
 17  BP                     303 non-null    int64  
 18  PR                     303 non-null    int64  
 19  Edema                  303 non-null    int64  
 20  Weak Peripheral Pulse  303 non-null    object 
 21  Lung rales             303 non-null    object 
 22  Systolic Murmur        303 non-null    object 
 23  Diastolic Murmur       303 non-null    object 
 24  Typical Chest Pain     303 non-null    int64  
 25  Dyspnea                303 non-null    object 
 26  Function Class         303 non-null    int64  
 27  Atypical               303 non-null    object 
 28  Nonanginal             303 non-null    object 
 29  LowTH Ang              303 non-null    object 
 30  Q Wave                 303 non-null    int64  
 31  St Elevation           303 non-null    int64  
 32  St Depression          303 non-null    int64  
 33  Tinversion             303 non-null    int64  
 34  LVH                    303 non-null    object 
 35  Poor R Progression     303 non-null    object 
 36  BBB                    303 non-null    object 
 37  FBS                    303 non-null    int64  
 38  CR                     303 non-null    float64
 39  TG                     303 non-null    int64  
 40  LDL                    303 non-null    int64  
 41  HDL                    303 non-null    float64
 42  BUN                    303 non-null    int64  
 43  ESR                    303 non-null    int64  
 44  HB                     303 non-null    float64
 45  K                      303 non-null    float64
 46  Na                     303 non-null    int64  
 47  WBC                    303 non-null    int64  
 48  Lymph                  303 non-null    int64  
 49  Neut                   303 non-null    int64  
 50  PLT                    303 non-null    int64  
 51  EF-TTE                 303 non-null    int64  
 52  Region RWMA            303 non-null    int64  
 53  VHD                    303 non-null    object 
 54  Cath                   303 non-null    object 
dtypes: float64(5), int64(29), object(21)
memory usage: 130.3+ KB
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [289]:
plt.figure(figsize=(10,5))
sns.histplot(data=dataset['Cath'],kde=True,palette='hot')
/tmp/ipykernel_9901/2585740825.py:2: UserWarning: Ignoring `palette` because no `hue` variable has been assigned.
  sns.histplot(data=dataset['Cath'],kde=True,palette='hot')
Out[289]:
<Axes: xlabel='Cath', ylabel='Count'>
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Encoding Dataset¶

In [290]:
LE = LabelEncoder()
dataset['Sex'] = LE.fit_transform(dataset['Sex'])
dataset['Obesity'] = LE.fit_transform(dataset['Obesity'])
dataset['CRF'] = LE.fit_transform(dataset['CRF'])
dataset['CVA'] = LE.fit_transform(dataset['CVA'])
dataset['Airway disease'] = LE.fit_transform(dataset['Airway disease'])
dataset['Thyroid Disease'] = LE.fit_transform(dataset['Thyroid Disease'])
dataset['CHF'] = LE.fit_transform(dataset['CHF'])
dataset['DLP'] = LE.fit_transform(dataset['DLP'])
dataset['Weak Peripheral Pulse'] = LE.fit_transform(dataset['Weak Peripheral Pulse'])
dataset['Lung rales'] = LE.fit_transform(dataset['Lung rales'])
dataset['Systolic Murmur'] = LE.fit_transform(dataset['Systolic Murmur'])
dataset['Diastolic Murmur'] = LE.fit_transform(dataset['Diastolic Murmur'])
dataset['Dyspnea'] = LE.fit_transform(dataset['Dyspnea'])
dataset['Atypical'] = LE.fit_transform(dataset['Atypical'])
dataset['Nonanginal'] = LE.fit_transform(dataset['Nonanginal'])
dataset['LowTH Ang'] = LE.fit_transform(dataset['LowTH Ang'])
dataset['LVH'] = LE.fit_transform(dataset['LVH'])
dataset['Poor R Progression'] = LE.fit_transform(dataset['Poor R Progression'])
dataset['BBB'] = LE.fit_transform(dataset['BBB'])
dataset['VHD'] = LE.fit_transform(dataset['VHD'])
dataset['Cath'] = LE.fit_transform(dataset['Cath'])
In [291]:
dataset
Out[291]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
0 53 90 175 1 29.387755 0 1 1 0 0 ... 4.7 141 5700 39 52 261 50 0 1 0
1 67 70 157 0 28.398718 0 1 0 0 0 ... 4.7 156 7700 38 55 165 40 4 1 0
2 54 54 164 1 20.077335 0 0 1 0 0 ... 4.7 139 7400 38 60 230 40 2 3 0
3 66 67 158 0 26.838648 0 1 0 0 0 ... 4.4 142 13000 18 72 742 55 0 2 1
4 50 87 153 0 37.165193 0 1 0 0 0 ... 4.0 140 9200 55 39 274 50 0 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 58 84 168 1 29.761905 0 0 0 0 0 ... 4.8 146 8500 34 58 251 45 0 1 0
299 55 64 152 0 27.700831 0 0 0 0 0 ... 4.0 139 11400 16 80 377 40 0 3 1
300 48 77 160 0 30.078125 0 1 0 0 1 ... 4.0 140 9000 35 55 279 55 0 1 1
301 57 90 159 0 35.599858 1 0 0 0 0 ... 3.8 141 3800 48 40 208 55 0 1 1
302 56 85 170 0 29.411765 0 1 1 0 0 ... 4.4 147 6000 32 55 302 55 0 1 0

303 rows × 55 columns

In [292]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 55 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    303 non-null    int64  
 1   Weight                 303 non-null    int64  
 2   Length                 303 non-null    int64  
 3   Sex                    303 non-null    int64  
 4   BMI                    303 non-null    float64
 5   DM                     303 non-null    int64  
 6   HTN                    303 non-null    int64  
 7   Current Smoker         303 non-null    int64  
 8   EX-Smoker              303 non-null    int64  
 9   FH                     303 non-null    int64  
 10  Obesity                303 non-null    int64  
 11  CRF                    303 non-null    int64  
 12  CVA                    303 non-null    int64  
 13  Airway disease         303 non-null    int64  
 14  Thyroid Disease        303 non-null    int64  
 15  CHF                    303 non-null    int64  
 16  DLP                    303 non-null    int64  
 17  BP                     303 non-null    int64  
 18  PR                     303 non-null    int64  
 19  Edema                  303 non-null    int64  
 20  Weak Peripheral Pulse  303 non-null    int64  
 21  Lung rales             303 non-null    int64  
 22  Systolic Murmur        303 non-null    int64  
 23  Diastolic Murmur       303 non-null    int64  
 24  Typical Chest Pain     303 non-null    int64  
 25  Dyspnea                303 non-null    int64  
 26  Function Class         303 non-null    int64  
 27  Atypical               303 non-null    int64  
 28  Nonanginal             303 non-null    int64  
 29  LowTH Ang              303 non-null    int64  
 30  Q Wave                 303 non-null    int64  
 31  St Elevation           303 non-null    int64  
 32  St Depression          303 non-null    int64  
 33  Tinversion             303 non-null    int64  
 34  LVH                    303 non-null    int64  
 35  Poor R Progression     303 non-null    int64  
 36  BBB                    303 non-null    int64  
 37  FBS                    303 non-null    int64  
 38  CR                     303 non-null    float64
 39  TG                     303 non-null    int64  
 40  LDL                    303 non-null    int64  
 41  HDL                    303 non-null    float64
 42  BUN                    303 non-null    int64  
 43  ESR                    303 non-null    int64  
 44  HB                     303 non-null    float64
 45  K                      303 non-null    float64
 46  Na                     303 non-null    int64  
 47  WBC                    303 non-null    int64  
 48  Lymph                  303 non-null    int64  
 49  Neut                   303 non-null    int64  
 50  PLT                    303 non-null    int64  
 51  EF-TTE                 303 non-null    int64  
 52  Region RWMA            303 non-null    int64  
 53  VHD                    303 non-null    int64  
 54  Cath                   303 non-null    int64  
dtypes: float64(5), int64(50)
memory usage: 130.3 KB
In [293]:
dataset.describe()
Out[293]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 ... 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000
mean 58.897690 73.831683 164.716172 0.580858 27.248339 0.297030 0.590759 0.207921 0.033003 0.158416 ... 4.230693 140.996700 7562.046205 32.399340 60.148515 221.488449 47.231023 0.620462 1.930693 0.287129
std 10.392278 11.987358 9.327661 0.494235 4.098865 0.457706 0.492507 0.406491 0.178941 0.365734 ... 0.458202 3.807885 2413.739323 9.972592 10.182493 60.796199 8.927194 1.132531 1.109180 0.453171
min 30.000000 48.000000 140.000000 0.000000 18.115413 0.000000 0.000000 0.000000 0.000000 0.000000 ... 3.000000 128.000000 3700.000000 7.000000 32.000000 25.000000 15.000000 0.000000 0.000000 0.000000
25% 51.000000 65.000000 158.000000 0.000000 24.514380 0.000000 0.000000 0.000000 0.000000 0.000000 ... 3.900000 139.000000 5800.000000 26.000000 52.500000 183.500000 45.000000 0.000000 1.000000 0.000000
50% 58.000000 74.000000 165.000000 1.000000 26.775510 0.000000 1.000000 0.000000 0.000000 0.000000 ... 4.200000 141.000000 7100.000000 32.000000 60.000000 210.000000 50.000000 0.000000 2.000000 0.000000
75% 66.000000 81.000000 171.000000 1.000000 29.411765 1.000000 1.000000 0.000000 0.000000 0.000000 ... 4.500000 143.000000 8800.000000 39.000000 67.000000 250.000000 55.000000 1.000000 3.000000 1.000000
max 86.000000 120.000000 188.000000 1.000000 40.900658 1.000000 1.000000 1.000000 1.000000 1.000000 ... 6.600000 156.000000 18000.000000 60.000000 89.000000 742.000000 60.000000 4.000000 3.000000 1.000000

8 rows × 55 columns

In [294]:
sns.heatmap(dataset.describe())
Out[294]:
<Axes: >
In [295]:
plt.figure(figsize=(40, 20))
sns.heatmap(dataset.corr())
top_corr_features = dataset.corr().index
g=sns.heatmap(dataset[top_corr_features].corr(),annot=True,linewidth=.10,cmap="rocket")
In [296]:
plt.figure(figsize=(40, 20))
matrix = np.triu(dataset.corr())
sns.heatmap(dataset.corr(), annot=True, linewidth=.10, mask=matrix, cmap="Paired");
In [297]:
plt.figure(figsize=(40, 20))
sns.histplot(data=dataset.corr(),kde=True,palette='hot')
Out[297]:
<Axes: ylabel='Count'>
In [298]:
plt.figure(figsize=(15, 12))
sns.countplot (dataset['Age'])
Out[298]:
<Axes: ylabel='count'>
In [299]:
sns.countplot (dataset['Cath'])
Out[299]:
<Axes: ylabel='count'>
In [300]:
dataset
Out[300]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
0 53 90 175 1 29.387755 0 1 1 0 0 ... 4.7 141 5700 39 52 261 50 0 1 0
1 67 70 157 0 28.398718 0 1 0 0 0 ... 4.7 156 7700 38 55 165 40 4 1 0
2 54 54 164 1 20.077335 0 0 1 0 0 ... 4.7 139 7400 38 60 230 40 2 3 0
3 66 67 158 0 26.838648 0 1 0 0 0 ... 4.4 142 13000 18 72 742 55 0 2 1
4 50 87 153 0 37.165193 0 1 0 0 0 ... 4.0 140 9200 55 39 274 50 0 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 58 84 168 1 29.761905 0 0 0 0 0 ... 4.8 146 8500 34 58 251 45 0 1 0
299 55 64 152 0 27.700831 0 0 0 0 0 ... 4.0 139 11400 16 80 377 40 0 3 1
300 48 77 160 0 30.078125 0 1 0 0 1 ... 4.0 140 9000 35 55 279 55 0 1 1
301 57 90 159 0 35.599858 1 0 0 0 0 ... 3.8 141 3800 48 40 208 55 0 1 1
302 56 85 170 0 29.411765 0 1 1 0 0 ... 4.4 147 6000 32 55 302 55 0 1 0

303 rows × 55 columns

In [303]:
df = dataset['Cath'].values.sum()
df
Out[303]:
87
In [304]:
303-87
Out[304]:
216
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Data Scaling¶

In [305]:
dataset
Out[305]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
0 53 90 175 1 29.387755 0 1 1 0 0 ... 4.7 141 5700 39 52 261 50 0 1 0
1 67 70 157 0 28.398718 0 1 0 0 0 ... 4.7 156 7700 38 55 165 40 4 1 0
2 54 54 164 1 20.077335 0 0 1 0 0 ... 4.7 139 7400 38 60 230 40 2 3 0
3 66 67 158 0 26.838648 0 1 0 0 0 ... 4.4 142 13000 18 72 742 55 0 2 1
4 50 87 153 0 37.165193 0 1 0 0 0 ... 4.0 140 9200 55 39 274 50 0 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 58 84 168 1 29.761905 0 0 0 0 0 ... 4.8 146 8500 34 58 251 45 0 1 0
299 55 64 152 0 27.700831 0 0 0 0 0 ... 4.0 139 11400 16 80 377 40 0 3 1
300 48 77 160 0 30.078125 0 1 0 0 1 ... 4.0 140 9000 35 55 279 55 0 1 1
301 57 90 159 0 35.599858 1 0 0 0 0 ... 3.8 141 3800 48 40 208 55 0 1 1
302 56 85 170 0 29.411765 0 1 1 0 0 ... 4.4 147 6000 32 55 302 55 0 1 0

303 rows × 55 columns

In [ ]:
 
In [306]:
"""dataset = np.array(dataset)
dataset"""
Out[306]:
'dataset = np.array(dataset)\ndataset'
In [307]:
'''SC = StandardScaler()
dataset[:,0:3] = SC.fit_transform(dataset[:,0:3])
dataset[:,4:5] = SC.fit_transform(dataset[:,4:5])
dataset[:,17:19] = SC.fit_transform(dataset[:,17:19])
dataset[:,37:38] = SC.fit_transform(dataset[:,37:38])
dataset[:,39:53] = SC.fit_transform(dataset[:,39:53])
dataset'''
Out[307]:
'SC = StandardScaler()\ndataset[:,0:3] = SC.fit_transform(dataset[:,0:3])\ndataset[:,4:5] = SC.fit_transform(dataset[:,4:5])\ndataset[:,17:19] = SC.fit_transform(dataset[:,17:19])\ndataset[:,37:38] = SC.fit_transform(dataset[:,37:38])\ndataset[:,39:53] = SC.fit_transform(dataset[:,39:53])\ndataset'
In [308]:
'''dataset = pd.DataFrame(dataset,columns = ['Age' , 'Weight' ,'Length', 'Sex' , 'BMI' ,'DM', 'HTN' , 'Current Smoker', 'EX-Smoker' , 'FH' , 'Obesity' , 
                                      'CRF','CVA','Airway disease','Thyroid Disease','CHF' , 'DLP' ,'BP', 'PR','Edema' , 'Weak Peripheral Pulse' , 
                                      'Lung rales' , 'Systolic Murmur' , 'Diastolic Murmur','Typical Chest Pain'  , 'Dyspnea' , 
                                      'Function Class' ,'Atypical', 'Nonanginal' , 'LowTH Ang' ,'Q Wave' , 'St Elevation' , 'St Depression' , 
                                      'Tinversion' , 'LVH' , 'Poor R Progression' , 'BBB' ,'FBS' , 'CR' , 'TG' , 'LDL' , 'HDL' , 'BUN' , 'ESR' , 
                                      'HB' , 'K' , 'Na' , 'WBC' , 'Lymph' , 'Neut' ,'PLT' , 'EF-TTE' , 'Region RWMA' , 'VHD','Cath'])
dataset'''
Out[308]:
"dataset = pd.DataFrame(dataset,columns = ['Age' , 'Weight' ,'Length', 'Sex' , 'BMI' ,'DM', 'HTN' , 'Current Smoker', 'EX-Smoker' , 'FH' , 'Obesity' , \n                                      'CRF','CVA','Airway disease','Thyroid Disease','CHF' , 'DLP' ,'BP', 'PR','Edema' , 'Weak Peripheral Pulse' , \n                                      'Lung rales' , 'Systolic Murmur' , 'Diastolic Murmur','Typical Chest Pain'  , 'Dyspnea' , \n                                      'Function Class' ,'Atypical', 'Nonanginal' , 'LowTH Ang' ,'Q Wave' , 'St Elevation' , 'St Depression' , \n                                      'Tinversion' , 'LVH' , 'Poor R Progression' , 'BBB' ,'FBS' , 'CR' , 'TG' , 'LDL' , 'HDL' , 'BUN' , 'ESR' , \n                                      'HB' , 'K' , 'Na' , 'WBC' , 'Lymph' , 'Neut' ,'PLT' , 'EF-TTE' , 'Region RWMA' , 'VHD','Cath'])\ndataset"
In [309]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 55 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    303 non-null    int64  
 1   Weight                 303 non-null    int64  
 2   Length                 303 non-null    int64  
 3   Sex                    303 non-null    int64  
 4   BMI                    303 non-null    float64
 5   DM                     303 non-null    int64  
 6   HTN                    303 non-null    int64  
 7   Current Smoker         303 non-null    int64  
 8   EX-Smoker              303 non-null    int64  
 9   FH                     303 non-null    int64  
 10  Obesity                303 non-null    int64  
 11  CRF                    303 non-null    int64  
 12  CVA                    303 non-null    int64  
 13  Airway disease         303 non-null    int64  
 14  Thyroid Disease        303 non-null    int64  
 15  CHF                    303 non-null    int64  
 16  DLP                    303 non-null    int64  
 17  BP                     303 non-null    int64  
 18  PR                     303 non-null    int64  
 19  Edema                  303 non-null    int64  
 20  Weak Peripheral Pulse  303 non-null    int64  
 21  Lung rales             303 non-null    int64  
 22  Systolic Murmur        303 non-null    int64  
 23  Diastolic Murmur       303 non-null    int64  
 24  Typical Chest Pain     303 non-null    int64  
 25  Dyspnea                303 non-null    int64  
 26  Function Class         303 non-null    int64  
 27  Atypical               303 non-null    int64  
 28  Nonanginal             303 non-null    int64  
 29  LowTH Ang              303 non-null    int64  
 30  Q Wave                 303 non-null    int64  
 31  St Elevation           303 non-null    int64  
 32  St Depression          303 non-null    int64  
 33  Tinversion             303 non-null    int64  
 34  LVH                    303 non-null    int64  
 35  Poor R Progression     303 non-null    int64  
 36  BBB                    303 non-null    int64  
 37  FBS                    303 non-null    int64  
 38  CR                     303 non-null    float64
 39  TG                     303 non-null    int64  
 40  LDL                    303 non-null    int64  
 41  HDL                    303 non-null    float64
 42  BUN                    303 non-null    int64  
 43  ESR                    303 non-null    int64  
 44  HB                     303 non-null    float64
 45  K                      303 non-null    float64
 46  Na                     303 non-null    int64  
 47  WBC                    303 non-null    int64  
 48  Lymph                  303 non-null    int64  
 49  Neut                   303 non-null    int64  
 50  PLT                    303 non-null    int64  
 51  EF-TTE                 303 non-null    int64  
 52  Region RWMA            303 non-null    int64  
 53  VHD                    303 non-null    int64  
 54  Cath                   303 non-null    int64  
dtypes: float64(5), int64(50)
memory usage: 130.3 KB
In [310]:
dataset.describe()
Out[310]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 ... 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000
mean 58.897690 73.831683 164.716172 0.580858 27.248339 0.297030 0.590759 0.207921 0.033003 0.158416 ... 4.230693 140.996700 7562.046205 32.399340 60.148515 221.488449 47.231023 0.620462 1.930693 0.287129
std 10.392278 11.987358 9.327661 0.494235 4.098865 0.457706 0.492507 0.406491 0.178941 0.365734 ... 0.458202 3.807885 2413.739323 9.972592 10.182493 60.796199 8.927194 1.132531 1.109180 0.453171
min 30.000000 48.000000 140.000000 0.000000 18.115413 0.000000 0.000000 0.000000 0.000000 0.000000 ... 3.000000 128.000000 3700.000000 7.000000 32.000000 25.000000 15.000000 0.000000 0.000000 0.000000
25% 51.000000 65.000000 158.000000 0.000000 24.514380 0.000000 0.000000 0.000000 0.000000 0.000000 ... 3.900000 139.000000 5800.000000 26.000000 52.500000 183.500000 45.000000 0.000000 1.000000 0.000000
50% 58.000000 74.000000 165.000000 1.000000 26.775510 0.000000 1.000000 0.000000 0.000000 0.000000 ... 4.200000 141.000000 7100.000000 32.000000 60.000000 210.000000 50.000000 0.000000 2.000000 0.000000
75% 66.000000 81.000000 171.000000 1.000000 29.411765 1.000000 1.000000 0.000000 0.000000 0.000000 ... 4.500000 143.000000 8800.000000 39.000000 67.000000 250.000000 55.000000 1.000000 3.000000 1.000000
max 86.000000 120.000000 188.000000 1.000000 40.900658 1.000000 1.000000 1.000000 1.000000 1.000000 ... 6.600000 156.000000 18000.000000 60.000000 89.000000 742.000000 60.000000 4.000000 3.000000 1.000000

8 rows × 55 columns

In [311]:
dataset.corr()
Out[311]:
Age Weight Length Sex BMI DM HTN Current Smoker EX-Smoker FH ... K Na WBC Lymph Neut PLT EF-TTE Region RWMA VHD Cath
Age 1.000000 -0.264585 -0.163753 -0.045769 -0.161414 0.072543 0.246690 -0.143879 0.076608 -0.183900 ... 0.154203 -0.071886 0.020398 -0.171529 0.173030 -0.049500 -0.140512 0.108663 0.117735 -0.357247
Weight -0.264585 1.000000 0.460631 0.234529 0.725005 -0.003531 -0.028532 0.157385 0.068977 0.021963 ... -0.018287 0.013916 -0.020214 0.020120 -0.051093 -0.094192 0.026789 -0.007648 -0.092527 0.066833
Length -0.163753 0.460631 1.000000 0.700279 -0.269356 -0.052318 -0.153668 0.335248 0.079034 0.004488 ... -0.086277 -0.066590 0.066658 -0.156436 0.115075 -0.133446 -0.093295 0.095715 -0.085441 -0.001024
Sex -0.045769 0.234529 0.700279 1.000000 -0.284088 -0.194348 -0.149278 0.336330 0.156932 -0.071098 ... -0.038046 -0.079913 0.086823 -0.132539 0.112422 -0.143698 -0.230896 0.128955 -0.059207 -0.067041
BMI -0.161414 0.725005 -0.269356 -0.284088 1.000000 0.045360 0.091652 -0.089398 0.005016 0.014045 ... 0.044587 0.067402 -0.074928 0.139583 -0.140037 -0.003964 0.093903 -0.079739 -0.037365 0.078189
DM 0.072543 -0.003531 -0.052318 -0.194348 0.045360 1.000000 0.217864 -0.208458 -0.120087 -0.064434 ... 0.100064 -0.083030 0.110345 0.033413 -0.024417 0.051054 -0.052507 0.064891 0.001550 -0.252897
HTN 0.246690 -0.028532 -0.153668 -0.149278 0.091652 0.217864 1.000000 -0.169000 0.041045 -0.098467 ... 0.011826 0.036355 -0.069374 0.017204 -0.025476 -0.043840 0.031365 -0.000372 0.117629 -0.287761
Current Smoker -0.143879 0.157385 0.335248 0.336330 -0.089398 -0.208458 -0.169000 1.000000 -0.094652 0.089532 ... -0.016599 0.036812 0.046205 -0.053224 0.042115 -0.048473 -0.068943 0.078479 -0.092783 -0.073504
EX-Smoker 0.076608 0.068977 0.079034 0.156932 0.005016 -0.120087 0.041045 -0.094652 1.000000 -0.080152 ... -0.004318 -0.038716 -0.057655 0.070524 -0.068122 -0.079102 -0.015153 0.012996 -0.005121 -0.035578
FH -0.183900 0.021963 0.004488 -0.071098 0.014045 -0.064434 -0.098467 0.089532 -0.080152 1.000000 ... -0.017255 -0.113749 0.067973 -0.014679 0.040786 -0.023000 0.089157 -0.038230 -0.078958 -0.035605
Obesity -0.126190 0.547267 -0.171962 -0.211808 0.712501 0.020839 0.136482 -0.050782 0.041635 0.011288 ... 0.001942 0.042850 -0.053290 0.128132 -0.126619 -0.008876 0.104896 -0.056608 0.004043 0.022461
CRF 0.126980 -0.025713 -0.033826 0.024718 0.009386 0.114975 0.118299 0.043924 0.106345 0.068090 ... -0.035429 -0.031033 -0.064608 -0.019977 -0.002076 0.072621 -0.048870 -0.057046 0.008896 -0.090205
CVA 0.026247 0.051610 -0.007180 0.005025 0.066967 0.029189 0.055122 -0.002528 -0.023930 0.014752 ... 0.036615 0.013742 0.036442 -0.036420 0.023592 -0.049700 -0.134162 -0.002344 0.008107 -0.024946
Airway disease 0.069941 -0.057719 0.004021 0.021834 -0.062781 0.028291 0.053887 0.074474 -0.035857 -0.084208 ... -0.059309 -0.018397 0.003789 -0.002468 -0.013250 -0.045168 0.010807 0.018335 0.044016 -0.084179
Thyroid Disease -0.095886 0.033370 -0.042495 -0.091986 0.068834 -0.051885 0.038634 -0.078789 -0.028410 0.053614 ... -0.029528 0.023249 0.044358 -0.021614 0.025847 0.036405 0.060102 -0.084389 -0.010214 0.048077
CHF -0.021618 0.029659 0.014113 0.048881 0.020205 -0.037405 -0.069137 -0.029482 -0.010631 -0.024966 ... 0.147091 -0.166453 0.058217 -0.013867 0.055765 -0.010892 -0.175818 0.171997 -0.100329 -0.036520
DLP 0.128147 -0.080068 -0.173428 -0.277911 0.046570 0.250364 0.108948 -0.190162 -0.103196 0.060996 ... 0.030825 0.002463 0.103705 0.042078 -0.013205 0.029321 0.145854 -0.057397 -0.007642 0.012718
BP 0.215527 -0.025930 -0.072511 -0.071315 0.031916 0.128010 0.570418 -0.079115 0.028781 -0.082999 ... 0.033902 0.067064 -0.071686 -0.005244 -0.007812 -0.092516 -0.047472 0.024047 0.112653 -0.237762
PR 0.023576 -0.075468 -0.077549 -0.095459 -0.015680 0.025350 0.124176 0.002796 -0.065240 -0.057717 ... 0.147650 0.010357 0.080313 -0.141028 0.144888 -0.066714 -0.210017 0.152990 -0.045900 -0.168366
Edema 0.132487 -0.035323 -0.039241 0.035315 -0.009812 0.016133 0.134600 -0.062343 0.057211 0.004589 ... 0.015969 -0.004275 0.101513 -0.025142 0.040315 -0.054050 -0.079315 0.083133 -0.002572 -0.054069
Weak Peripheral Pulse 0.153593 -0.021990 -0.037781 -0.047479 -0.001103 0.029189 0.107811 -0.066365 -0.023930 -0.056199 ... 0.070595 -0.013517 0.033217 -0.044226 0.013398 -0.018115 -0.017891 -0.048169 0.031502 -0.082207
Lung rales 0.105655 -0.097527 -0.031980 0.129113 -0.079006 -0.048936 -0.053769 -0.055963 -0.035857 0.060764 ... 0.044835 -0.101942 0.167806 -0.075130 0.087421 -0.014644 -0.315855 0.158786 -0.003786 -0.045179
Systolic Murmur 0.044819 -0.092795 -0.057362 0.042723 -0.059971 -0.109338 -0.063208 -0.012476 0.088946 -0.039506 ... 0.032515 -0.004733 0.049072 -0.001331 -0.027609 0.020184 -0.228935 0.073055 -0.132077 0.004856
Diastolic Murmur 0.029832 -0.052770 0.007420 -0.008972 -0.062039 -0.028643 -0.052064 -0.041738 -0.032323 0.030575 ... -0.049987 -0.061213 0.012437 -0.120269 0.094974 0.183080 -0.043798 -0.078819 -0.059273 0.146777
Typical Chest Pain 0.138387 -0.002986 0.023149 0.036770 -0.012911 0.105623 0.122788 0.079987 0.058855 -0.035920 ... 0.126926 -0.026933 -0.022196 -0.076830 0.065572 -0.082399 -0.103957 0.177166 0.115833 -0.542967
Dyspnea 0.059379 -0.067434 -0.092743 -0.038180 0.015020 0.090134 0.038359 -0.095978 0.021484 -0.058742 ... -0.035051 -0.063902 0.040498 0.011673 -0.038503 0.042101 -0.132685 -0.100747 -0.022281 0.125211
Function Class 0.051424 0.040371 -0.012710 -0.043835 0.064736 0.086200 0.092880 -0.037824 0.024499 0.010162 ... 0.007217 -0.055051 0.075008 -0.004590 -0.012557 0.050548 -0.119119 0.131131 0.034511 -0.097087
Atypical -0.141722 -0.012166 -0.061937 -0.043794 0.028345 -0.088066 -0.144666 -0.058833 -0.082886 0.083629 ... -0.113479 0.079633 -0.053956 0.125676 -0.094894 -0.042610 0.162596 -0.187982 -0.126360 0.415922
Nonanginal -0.088702 -0.009010 0.005612 -0.068600 -0.025488 -0.088890 0.016443 -0.084608 0.038985 -0.062024 ... -0.061006 0.023496 -0.029963 0.015727 -0.032483 0.164402 0.098194 -0.090414 -0.065181 0.274184
LowTH Ang 0.087227 -0.056749 -0.071919 -0.095959 -0.018212 -0.052986 0.067845 0.058668 -0.015059 -0.035366 ... -0.005469 0.160886 0.067246 0.140009 -0.121469 0.007402 -0.043270 0.171551 0.005102 -0.051733
Q Wave -0.061677 0.020584 0.045229 0.051031 -0.015837 0.072583 -0.043583 -0.084608 0.121591 -0.062024 ... 0.048677 -0.155067 0.013517 -0.055419 0.029939 0.003692 -0.266077 0.222826 0.028104 -0.149848
St Elevation -0.056926 0.071406 0.050602 0.091379 0.029896 0.028955 -0.040627 0.042192 0.047341 -0.009379 ... 0.112392 -0.103195 0.128728 -0.060937 0.035447 0.070754 -0.231493 0.268545 0.127353 -0.139684
St Depression 0.177432 -0.114619 -0.150480 -0.114342 -0.009146 0.015532 0.016736 0.023762 -0.014970 0.080075 ... -0.033712 -0.005668 0.107977 -0.019841 0.066265 0.119812 -0.015214 0.165025 -0.042775 -0.144426
Tinversion 0.041913 -0.021033 0.023690 0.054493 -0.043311 -0.058998 0.115040 0.200881 0.082060 0.054250 ... 0.032172 -0.024134 0.018630 -0.015916 -0.003813 0.011547 -0.121389 0.173485 0.001550 -0.236933
LVH 0.125612 0.100367 -0.007599 -0.016626 0.110330 -0.056449 0.221262 -0.103449 0.025293 -0.115338 ... 0.011220 0.084145 -0.005190 -0.028019 -0.027420 0.014723 0.030394 -0.051835 0.052649 -0.051196
Poor R Progression 0.003599 -0.106376 -0.053120 -0.048372 -0.081253 0.269163 0.027011 -0.041738 -0.032323 0.083817 ... -0.007490 -0.050986 0.010823 -0.092933 0.052902 -0.078598 -0.087423 -0.044432 0.010951 -0.111040
BBB 0.005434 -0.033420 0.058785 -0.002436 -0.072387 -0.014153 -0.052273 -0.029726 0.011603 -0.007153 ... -0.036975 0.075941 -0.042690 0.002519 -0.016381 -0.090139 0.128472 -0.076629 -0.015274 -0.043433
FBS 0.015385 0.012737 -0.094789 -0.217349 0.089380 0.677940 0.109592 -0.101457 -0.079537 -0.080815 ... 0.102861 -0.059455 0.159957 -0.003719 0.031787 0.019886 -0.056692 0.037291 0.046482 -0.205553
CR 0.227097 0.150226 0.162634 0.260150 0.034338 0.028606 0.158881 -0.046339 0.178112 0.031882 ... -0.010450 -0.074997 0.145125 -0.066620 0.096707 -0.091782 -0.115351 0.031475 0.032393 -0.086758
TG -0.110793 0.078469 -0.034389 -0.016784 0.109422 0.108792 0.045954 0.062399 0.015597 -0.019083 ... 0.023490 0.060313 0.012340 0.090065 -0.081575 -0.049424 -0.027902 0.035353 -0.010386 -0.140593
LDL -0.033576 -0.023233 -0.090970 -0.104153 0.040001 -0.027167 0.022755 -0.025440 -0.025844 0.111292 ... 0.037732 0.168126 0.019056 0.118307 -0.085044 0.013452 0.159394 -0.026927 -0.031837 0.023535
HDL -0.035793 -0.059713 -0.050594 -0.116294 -0.024338 -0.043890 -0.094226 0.010228 -0.056676 0.078685 ... -0.074145 0.088912 -0.063782 0.028257 -0.024528 0.000638 0.104394 -0.062022 -0.130615 0.042587
BUN 0.300663 -0.057670 -0.071229 0.009351 -0.011139 0.144394 0.152895 -0.061596 0.066455 -0.014419 ... 0.098618 -0.136310 0.088416 -0.044946 0.024421 0.041289 -0.116665 0.018362 0.066315 -0.088903
ESR 0.183127 -0.139314 -0.222182 -0.306189 0.023259 0.190397 0.161704 -0.121199 0.017858 -0.061457 ... 0.006577 -0.069327 0.160759 -0.158031 0.138741 0.246826 -0.057497 0.054697 0.038346 -0.178447
HB -0.161018 0.274218 0.341028 0.403496 0.031107 -0.156382 -0.115935 0.216144 -0.024528 -0.045348 ... 0.033308 0.138745 -0.000567 0.083837 -0.075441 -0.106252 0.006186 -0.045481 -0.017939 0.042416
K 0.154203 -0.018287 -0.086277 -0.038046 0.044587 0.100064 0.011826 -0.016599 -0.004318 -0.017255 ... 1.000000 0.010686 0.118689 -0.008561 -0.002896 0.022865 -0.159512 0.229266 -0.040756 -0.181320
Na -0.071886 0.013916 -0.066590 -0.079913 0.067402 -0.083030 0.036355 0.036812 -0.038716 -0.113749 ... 0.010686 1.000000 -0.093826 0.141032 -0.134406 -0.022049 0.136491 -0.022558 -0.031414 0.084982
WBC 0.020398 -0.020214 0.066658 0.086823 -0.074928 0.110345 -0.069374 0.046205 -0.057655 0.067973 ... 0.118689 -0.093826 1.000000 -0.322100 0.377770 0.290805 -0.137910 0.175318 -0.011746 -0.070830
Lymph -0.171529 0.020120 -0.156436 -0.132539 0.139583 0.033413 0.017204 -0.053224 0.070524 -0.014679 ... -0.008561 0.141032 -0.322100 1.000000 -0.923081 -0.011639 0.239827 -0.079181 -0.037004 0.126945
Neut 0.173030 -0.051093 0.115075 0.112422 -0.140037 -0.024417 -0.025476 0.042115 -0.068122 0.040786 ... -0.002896 -0.134406 0.377770 -0.923081 1.000000 0.003637 -0.228776 0.112580 0.014987 -0.124086
PLT -0.049500 -0.094192 -0.133446 -0.143698 -0.003964 0.051054 -0.043840 -0.048473 -0.079102 -0.023000 ... 0.022865 -0.022049 0.290805 -0.011639 0.003637 1.000000 0.068409 -0.010812 0.021471 0.094888
EF-TTE -0.140512 0.026789 -0.093295 -0.230896 0.093903 -0.052507 0.031365 -0.068943 -0.015153 0.089157 ... -0.159512 0.136491 -0.137910 0.239827 -0.228776 0.068409 1.000000 -0.450799 0.083887 0.234009
Region RWMA 0.108663 -0.007648 0.095715 0.128955 -0.079739 0.064891 -0.000372 0.078479 0.012996 -0.038230 ... 0.229266 -0.022558 0.175318 -0.079181 0.112580 -0.010812 -0.450799 1.000000 0.005350 -0.316011
VHD 0.117735 -0.092527 -0.085441 -0.059207 -0.037365 0.001550 0.117629 -0.092783 -0.005121 -0.078958 ... -0.040756 -0.031414 -0.011746 -0.037004 0.014987 0.021471 0.083887 0.005350 1.000000 -0.065681
Cath -0.357247 0.066833 -0.001024 -0.067041 0.078189 -0.252897 -0.287761 -0.073504 -0.035578 -0.035605 ... -0.181320 0.084982 -0.070830 0.126945 -0.124086 0.094888 0.234009 -0.316011 -0.065681 1.000000

55 rows × 55 columns

In [312]:
plt.figure(figsize=(40, 20))
matrix = np.triu(dataset.corr())
sns.heatmap(dataset.corr(), annot=True, linewidth=.10, mask=matrix, cmap="Paired");
In [ ]:
 
In [313]:
x = dataset.iloc[:,:-1].values
x
Out[313]:
array([[ 53.,  90., 175., ...,  50.,   0.,   1.],
       [ 67.,  70., 157., ...,  40.,   4.,   1.],
       [ 54.,  54., 164., ...,  40.,   2.,   3.],
       ...,
       [ 48.,  77., 160., ...,  55.,   0.,   1.],
       [ 57.,  90., 159., ...,  55.,   0.,   1.],
       [ 56.,  85., 170., ...,  55.,   0.,   1.]])
In [314]:
y = dataset.iloc[:,-1].values
y
Out[314]:
array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0])
In [315]:
"""from sklearn.decomposition import PCA
pca = PCA()
x = pca.fit_transform(x)
x"""
Out[315]:
'from sklearn.decomposition import PCA\npca = PCA()\nx = pca.fit_transform(x)\nx'
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [177]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.1,random_state = 0)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [178]:
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
In [179]:
#!pip install catboost
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

SVC(87%)¶

In [180]:
svc = SVC(kernel='linear', max_iter=500)
svc.fit(x_train,y_train)
/home/mahmoudragab/anaconda3/lib/python3.10/site-packages/sklearn/svm/_base.py:299: ConvergenceWarning: Solver terminated early (max_iter=500).  Consider pre-processing your data with StandardScaler or MinMaxScaler.
  warnings.warn(
Out[180]:
SVC(kernel='linear', max_iter=500)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(kernel='linear', max_iter=500)
In [181]:
y_pred = svc.predict(x_test)
y_pred
Out[181]:
array([1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1])
In [182]:
y_test
Out[182]:
array([0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1])
In [183]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.57      0.40      0.47        20
           1       0.29      0.45      0.36        11

    accuracy                           0.42        31
   macro avg       0.43      0.43      0.41        31
weighted avg       0.47      0.42      0.43        31

In [ ]:
 
In [184]:
CM = confusion_matrix (y_test,y_pred)
sns.heatmap(CM,annot=True,
            fmt='g',
            xticklabels=['CAD','Normal'],
            yticklabels=['CAD','Normal'])
print (CM)
[[ 8 12]
 [ 6  5]]
In [185]:
df_comp = pd.DataFrame({'Actual':y_test , 'Predict':y_pred})
df_comp
Out[185]:
Actual Predict
0 0 1
1 0 1
2 0 1
3 1 1
4 0 1
5 0 0
6 1 1
7 1 0
8 0 1
9 1 0
10 0 1
11 0 0
12 1 0
13 0 0
14 0 0
15 0 1
16 0 0
17 0 0
18 0 1
19 0 0
20 0 0
21 1 1
22 0 1
23 1 1
24 1 0
25 1 0
26 0 1
27 1 0
28 0 1
29 0 1
30 1 1
In [186]:
print ('y_test = ',y_test.sum())
print ('y_pred = ',y_pred.sum())
y_test =  11
y_pred =  17
In [187]:
plt.title (' Actual & Predict ',color = 'r')
sns.heatmap(df_comp)
Out[187]:
<Axes: title={'center': ' Actual & Predict '}>
In [188]:
plt.style.use('default')
plt.figure(figsize=(12,7))
data = [87,13]
names = ['Correct Result','Wrong Result']
plt.title (' Accuracy ',color = 'black')
plt.pie (data,labels = names,labeldistance = 1.1,startangle = 90,colors = ['g','y'],autopct = '%1.0f%%',)
Out[188]:
([<matplotlib.patches.Wedge at 0x7faec567e560>,
  <matplotlib.patches.Wedge at 0x7faec567e4a0>],
 [Text(-0.4368626645752139, -1.0095300947967052, 'Correct Result'),
  Text(0.436862664575214, 1.009530094796705, 'Wrong Result')],
 [Text(-0.23828872613193483, -0.550652778980021, '87%'),
  Text(0.23828872613193489, 0.5506527789800209, '13%')])

RandomForestClassifier(90%)¶

In [189]:
RFC = RandomForestClassifier(n_estimators = 250,max_features = 30)
RFC.fit(x_train,y_train)
Out[189]:
RandomForestClassifier(max_features=30, n_estimators=250)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_features=30, n_estimators=250)
In [190]:
y_pred = RFC.predict(x_test)
y_pred
Out[190]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1])
In [191]:
y_test
Out[191]:
array([0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1])
In [192]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        20
           1       1.00      0.73      0.84        11

    accuracy                           0.90        31
   macro avg       0.93      0.86      0.89        31
weighted avg       0.92      0.90      0.90        31

In [193]:
plt.style.use ("dark_background")
CM = confusion_matrix (y_test,y_pred)
sns.heatmap(CM,annot=True,
            fmt='g',
            xticklabels=['CAD','Normal'],
            yticklabels=['CAD','Normal'])
print (CM)
[[20  0]
 [ 3  8]]
In [194]:
df_comp = pd.DataFrame({'Actual':y_test , 'Predict':y_pred})
df_comp
Out[194]:
Actual Predict
0 0 0
1 0 0
2 0 0
3 1 1
4 0 0
5 0 0
6 1 0
7 1 1
8 0 0
9 1 1
10 0 0
11 0 0
12 1 1
13 0 0
14 0 0
15 0 0
16 0 0
17 0 0
18 0 0
19 0 0
20 0 0
21 1 1
22 0 0
23 1 1
24 1 0
25 1 0
26 0 0
27 1 1
28 0 0
29 0 0
30 1 1
In [345]:
plt.title (' Actual & Predict ',color = 'r')
sns.heatmap(df_comp)
Out[345]:
<Axes: title={'center': ' Actual & Predict '}>
In [196]:
plt.style.use('default')
plt.figure(figsize=(12,7))
data = [90,10]
names = ['Correct Result','Wrong Result']
plt.title (' Accuracy ',color = 'black')
plt.pie (data,labels = names,labeldistance = 1.1,startangle = 90,colors = ['g','y'],autopct = '%1.0f%%',)
Out[196]:
([<matplotlib.patches.Wedge at 0x7faec5471db0>,
  <matplotlib.patches.Wedge at 0x7faec5471cf0>],
 [Text(-0.33991877217145866, -1.046162142464278, 'Correct Result'),
  Text(0.3399188456330338, 1.0461621185951564, 'Wrong Result')],
 [Text(-0.18541023936625015, -0.5706338958896061, '90%'),
  Text(0.18541027943620023, 0.5706338828700852, '10%')])

XGBoost (90%)¶

In [197]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic",learning_rate=0.01,max_depth=3,n_estimators=100,subsample=0.9,colsample_bytree=0.9)
xgb_model.fit(x_train,y_train)
Out[197]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
In [198]:
y_pred = xgb_model.predict(x_test)
y_pred
Out[198]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1])
In [199]:
y_test
Out[199]:
array([0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1])
In [200]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        20
           1       1.00      0.82      0.90        11

    accuracy                           0.94        31
   macro avg       0.95      0.91      0.93        31
weighted avg       0.94      0.94      0.93        31

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [201]:
CBC = CatBoostClassifier(iterations=100, learning_rate=0.01, depth=5, loss_function='Logloss',verbose=100)
CBC.fit(x_train,y_train)
0:	learn: 0.6852223	total: 3.07ms	remaining: 304ms
99:	learn: 0.3468872	total: 154ms	remaining: 0us
Out[201]:
<catboost.core.CatBoostClassifier at 0x7faec5448a30>
In [202]:
y_pred = CBC.predict(x_test)
y_pred
Out[202]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1])
In [203]:
y_test
Out[203]:
array([0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1])
In [204]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        20
           1       1.00      0.73      0.84        11

    accuracy                           0.90        31
   macro avg       0.93      0.86      0.89        31
weighted avg       0.92      0.90      0.90        31

In [ ]:
 

Imbalanced Data¶

In [205]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(x_train, y_train)
In [206]:
CBC = CatBoostClassifier(iterations=100, learning_rate=0.01, depth=5, loss_function='Logloss',verbose=100)
CBC.fit(X_res,y_res)
0:	learn: 0.6865856	total: 1.7ms	remaining: 168ms
99:	learn: 0.3083666	total: 238ms	remaining: 0us
Out[206]:
<catboost.core.CatBoostClassifier at 0x7faec5421390>
In [207]:
y_pred = CBC.predict(x_test)
y_pred
Out[207]:
array([0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1])
In [208]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.90      0.95      0.93        20
           1       0.90      0.82      0.86        11

    accuracy                           0.90        31
   macro avg       0.90      0.88      0.89        31
weighted avg       0.90      0.90      0.90        31

In [ ]:
 
In [ ]:
 
In [209]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

class SMOTE2:
    
    def __init__(self, k_neighbors=5, random_state=None):
        self.k = k_neighbors
        self.rng = np.random.RandomState(random_state)
        
    def _generate_samples(self, X, y, minority_class):
        # find the k nearest neighbors of each minority class sample
        nn = NearestNeighbors(n_neighbors=self.k).fit(X)
        minority_indices = np.where(y == minority_class)[0]
        nn_distances, nn_indices = nn.kneighbors(X[minority_indices])
        
        # randomly choose neighbors and generate synthetic samples
        n_minority = len(minority_indices)
        n_synthetic = int(n_minority * self.ratio) - n_minority
        synthetic_indices = np.zeros((n_synthetic, X.shape[1]))
        
        for i in range(n_synthetic):
            j = self.rng.randint(n_minority)
            nn = nn_indices[j, self.rng.randint(1, self.k)]
            gap = self.rng.rand(X.shape[1])
            synthetic_indices[i] = X[minority_indices[j]] + gap * (X[nn] - X[minority_indices[j]])
        
        return synthetic_indices
        
    def fit_resample(self, X, y, ratio=1.0):
        self.ratio = ratio
        unique_classes = np.unique(y)
        X_resampled = np.copy(X)
        y_resampled = np.copy(y)
        
        for minority_class in unique_classes:
            if minority_class == np.argmax(np.bincount(y)):
                continue
            
            synthetic_indices = self._generate_samples(X, y, minority_class)
            X_resampled = np.vstack([X_resampled, synthetic_indices])
            y_resampled = np.hstack([y_resampled, np.full((len(synthetic_indices), ), minority_class)])
            
        return X_resampled, y_resampled
In [210]:
sm = SMOTE2(random_state=42)
X_res, y_res = sm.fit_resample(x_train, y_train.astype(int))
In [211]:
CBC = CatBoostClassifier(iterations=100, learning_rate=0.01, depth=5, loss_function='Logloss',verbose=100)
CBC.fit(X_res,y_res)
y_pred = CBC.predict(x_test)
y_pred
0:	learn: 0.6852223	total: 1.17ms	remaining: 116ms
99:	learn: 0.3468872	total: 150ms	remaining: 0us
Out[211]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1])
In [212]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        20
           1       1.00      0.73      0.84        11

    accuracy                           0.90        31
   macro avg       0.93      0.86      0.89        31
weighted avg       0.92      0.90      0.90        31

In [213]:
RFC = RandomForestClassifier(n_estimators=250,max_features = 55,max_depth=2, random_state=42,max_leaf_nodes=15)
RFC.fit(X_res,y_res)
y_pred = RFC.predict(x_test)
y_pred
Out[213]:
array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1])
In [214]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        20
           1       1.00      0.73      0.84        11

    accuracy                           0.90        31
   macro avg       0.93      0.86      0.89        31
weighted avg       0.92      0.90      0.90        31

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [108]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [215]:
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(x_train, y_train)
In [216]:
RFC = RandomForestClassifier(n_estimators=250,max_features = 55,max_depth=5, random_state=42,max_leaf_nodes=8)
RFC.fit(X_train_resampled,y_train_resampled)
y_pred = RFC.predict(x_test)
y_pred
Out[216]:
array([0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1])
In [217]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        20
           1       0.82      0.82      0.82        11

    accuracy                           0.87        31
   macro avg       0.86      0.86      0.86        31
weighted avg       0.87      0.87      0.87        31

In [ ]:
 
In [218]:
# train logistic regression model on resampled data
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)
lr.fit(X_train_resampled, y_train_resampled)

# evaluate model on testing data
y_pred = lr.predict(x_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.81      0.65      0.72        20
           1       0.53      0.73      0.62        11

    accuracy                           0.68        31
   macro avg       0.67      0.69      0.67        31
weighted avg       0.71      0.68      0.68        31

/home/mahmoudragab/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [219]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train_resampled, y_train_resampled)
y_pred=clf.predict(x_test)
In [220]:
y_pred
Out[220]:
array([0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1])
In [221]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.90      0.95      0.93        20
           1       0.90      0.82      0.86        11

    accuracy                           0.90        31
   macro avg       0.90      0.88      0.89        31
weighted avg       0.90      0.90      0.90        31

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [327]:
#**************************************************************************************************************
In [ ]:
 
In [331]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from collections import Counter
In [332]:
''# Generate a synthetic imbalanced dataset
x, y = make_classification(n_samples=1000, n_features=20, n_informative=2,
                           n_redundant=10, n_classes=2, weights=[0.99],
                           flip_y=0, random_state=1)
print("Before oversampling:", Counter(y))
''
Before oversampling: Counter({0: 990, 1: 10})
Out[332]:
''
In [333]:
# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(x, y)
print("After oversampling:", Counter(y_resampled))
After oversampling: Counter({0: 990, 1: 990})
In [334]:
x_train , x_test , y_train , y_test = train_test_split(X_resampled,y_resampled,test_size=0.1,random_state = 0)
In [ ]:
 
In [ ]:
 

XGBoost Classifier¶

In [335]:
xgb_model_2 = xgb.XGBClassifier(objective="binary:logistic",learning_rate=0.01,max_depth=3,n_estimators=100,subsample=0.9,colsample_bytree=0.9)
xgb_model_2.fit(x_train,y_train)
y_pred = xgb_model_2.predict(x_test)
y_pred
Out[335]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1])
In [336]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       1.00      0.93      0.96       100
           1       0.93      1.00      0.97        98

    accuracy                           0.96       198
   macro avg       0.97      0.97      0.96       198
weighted avg       0.97      0.96      0.96       198

In [337]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Model Accuracy =",acc*100,"%")
Model Accuracy = 96.46464646464646 %
In [339]:
cm = confusion_matrix(y_test, y_pred)
%matplotlib inline
# Plot confusion matrix
class_names = ['Cad','Normal']
df_cm = pd.DataFrame(cm, index = [i for i in class_names], columns = [i for i in class_names])
sns.heatmap(df_cm, annot = True)
cmap = plt.cm.Blues
plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
# Model Accuracy
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Model Accuracy =",acc*100,"%")
Model Accuracy = 96.46464646464646 %
In [ ]:
 

CatBoostClassifier¶

In [341]:
CBC = CatBoostClassifier(iterations=100, learning_rate=0.01, depth=5, loss_function='Logloss',verbose=100)
CBC.fit(x_train,y_train)
y_pred = CBC.predict(x_test)
y_pred
0:	learn: 0.6750508	total: 19.3ms	remaining: 1.91s
99:	learn: 0.1308255	total: 325ms	remaining: 0us
Out[341]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1])
In [342]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       100
           1       0.95      1.00      0.98        98

    accuracy                           0.97       198
   macro avg       0.98      0.97      0.97       198
weighted avg       0.98      0.97      0.97       198

In [343]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Model Accuracy =",acc*100,"%")
Model Accuracy = 97.47474747474747 %
In [344]:
cm = confusion_matrix(y_test, y_pred)
%matplotlib inline
# Plot confusion matrix
class_names = ['Cad','Normal']
df_cm = pd.DataFrame(cm, index = [i for i in class_names], columns = [i for i in class_names])
sns.heatmap(df_cm, annot = True)
cmap = plt.cm.Blues
plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
# Model Accuracy
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Model Accuracy =",acc*100,"%")
Model Accuracy = 97.47474747474747 %
In [346]:
df_comp = pd.DataFrame({'Actual':y_test , 'Predict':y_pred})
df_comp
Out[346]:
Actual Predict
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
... ... ...
193 1 1
194 1 1
195 0 0
196 1 1
197 1 1

198 rows × 2 columns

In [347]:
plt.title (' Actual & Predict ',color = 'r')
sns.heatmap(df_comp)
Out[347]:
<Axes: title={'center': ' Actual & Predict '}>
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [328]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(x_train, y_train)
y_pred=clf.predict(x_test)
In [329]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00        98

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198

In [ ]: